#Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import calendar
import plotly.express as px
# Load the tornado records from the CSV file into a DataFrame
DATA_PATH = "us_tornado_dataset_1950_2021.csv"
df = pd.read_csv(DATA_PATH)
# Preview the first rows
df.head()
| | yr | mo | dy | date | st | mag | inj | fat | slat | slon | elat | elon | len | wid |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1950 | 1 | 3 | 1950-01-03 | IL | 3 | 3 | 0 | 39.10 | -89.30 | 39.12 | -89.23 | 3.6 | 130 |
| 1 | 1950 | 1 | 3 | 1950-01-03 | MO | 3 | 3 | 0 | 38.77 | -90.22 | 38.83 | -90.03 | 9.5 | 150 |
| 2 | 1950 | 1 | 3 | 1950-01-03 | OH | 1 | 1 | 0 | 40.88 | -84.58 | 0.00 | 0.00 | 0.1 | 10 |
| 3 | 1950 | 1 | 13 | 1950-01-13 | AR | 3 | 1 | 1 | 34.40 | -94.37 | 0.00 | 0.00 | 0.6 | 17 |
| 4 | 1950 | 1 | 25 | 1950-01-25 | IL | 2 | 0 | 0 | 41.17 | -87.33 | 0.00 | 0.00 | 0.1 | 100 |
# Replace the abbreviated column names with more descriptive ones
COLUMN_RENAMES = {
    'fat': 'fatalities',
    'len': 'length',
    'st': 'state',
    'inj': 'injuries',
}
df.rename(columns=COLUMN_RENAMES, inplace=True)
df.head()
| | yr | mo | dy | date | state | mag | injuries | fatalities | slat | slon | elat | elon | length | wid |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1950 | 1 | 3 | 1950-01-03 | IL | 3 | 3 | 0 | 39.10 | -89.30 | 39.12 | -89.23 | 3.6 | 130 |
| 1 | 1950 | 1 | 3 | 1950-01-03 | MO | 3 | 3 | 0 | 38.77 | -90.22 | 38.83 | -90.03 | 9.5 | 150 |
| 2 | 1950 | 1 | 3 | 1950-01-03 | OH | 1 | 1 | 0 | 40.88 | -84.58 | 0.00 | 0.00 | 0.1 | 10 |
| 3 | 1950 | 1 | 13 | 1950-01-13 | AR | 3 | 1 | 1 | 34.40 | -94.37 | 0.00 | 0.00 | 0.6 | 17 |
| 4 | 1950 | 1 | 25 | 1950-01-25 | IL | 2 | 0 | 0 | 41.17 | -87.33 | 0.00 | 0.00 | 0.1 | 100 |
# Dimensions of the DataFrame as (rows, columns)
df.shape
(67558, 14)
# Number of missing values in each column
df.isna().sum()
yr 0 mo 0 dy 0 date 0 state 0 mag 0 injuries 0 fatalities 0 slat 0 slon 0 elat 0 elon 0 length 0 wid 0 dtype: int64
# Data type of each column
df.dtypes
yr int64 mo int64 dy int64 date object state object mag int64 injuries int64 fatalities int64 slat float64 slon float64 elat float64 elon float64 length float64 wid int64 dtype: object
# Check the unique values in the 'mag' column. A value of -9 indicates that the tornado's EF rating is unknown.
df['mag'].unique()
array([ 3, 1, 2, 4, 0, 5, -9], dtype=int64)
# Number of rows for each distinct 'mag' value
df['mag'].value_counts()
0 31375 1 22885 2 9517 3 2536 -9 605 4 581 5 59 Name: mag, dtype: int64
# Count the tornado records in each year and draw the yearly totals as a line
df1 = df.groupby('yr')['yr'].count()
years, yearly_totals = df1.index, df1.values
plt.plot(years, yearly_totals)
plt.ylabel('Number of Tornadoes')
plt.xlabel('Year')
# The title call stays last so the cell still echoes the title Text object
plt.title('Tornadoes in the US', fontweight='bold')
Text(0.5, 1.0, 'Tornadoes in the US')
# Tally the tornado records per state, ordered from the largest tally down
state_counts = df.groupby('state')['state'].count()
df2 = state_counts.reset_index(name='counts')
df2 = df2.sort_values(['counts'], ascending=False)

# Bar chart of the ten states at the top of that ordering
top_ten_states = df2.head(10)
top_ten_states.plot(x='state', kind='bar')
plt.ylabel('Number of tornadoes')
plt.xlabel('State')
plt.title('Top 10 states with the highest number of tornadoes', fontweight='bold')
Text(0.5, 1.0, 'Top 10 states with the highest number of tornadoes')
# US map shaded by the per-state tornado tally held in df2['counts']
fig = px.choropleth(
    data_frame=df2,
    locations='state',
    locationmode='USA-states',
    scope='usa',
    color='counts',
    color_continuous_scale='REDS',
)
fig.update_layout(
    title_text='Count of tornadoes across the states in the US.<br>(Hover for statewise counts)'
)
fig.show()
# Count the tornado records falling in each calendar month
monthly_counts = df.groupby('mo')['mo'].count()
df_months_count = monthly_counts.reset_index(name='monthly_tornadoes')

# Replace the month numbers with abbreviated month names for the axis labels
df_months_count['mo'] = [
    calendar.month_abbr[month_number] for month_number in df_months_count['mo']
]

df_months_count.plot(x='mo', kind='bar')
plt.ylabel('Number of tornadoes')
plt.xlabel('Months')
plt.title('Number of tornadoes per month throughout the US', fontweight='bold')
Text(0.5, 1.0, 'Number of tornadoes per month throughout the US')
# Sum the 'fatalities' column over every record and report the total
total_fatalities = df['fatalities'].sum()
print('The total number of fatalities as a result of tornadoes:', total_fatalities)
The total number of fatalities as a result of tornadoes: 6112
# Total fatalities per state, ordered from the highest total down
fatalities_by_state = df.groupby('state')['fatalities'].sum().reset_index()
fatalities = fatalities_by_state.sort_values('fatalities', ascending=False)

# Bar chart of the ten states at the top of that ordering
deadliest_ten = fatalities.head(10)
deadliest_ten.plot(x='state', kind='bar')
plt.ylabel('Fatalities')
plt.xlabel('State')
plt.title('Top 10 states with the highest fatalities', fontweight='bold')
Text(0.5, 1.0, 'Top 10 states with the highest fatalities')
# Mean fatalities per tornado for each F/EF rating.
# Rows with mag == -9 are dropped first so they do not appear as a rating
# (the comment on the 'mag' column earlier in this file says -9 marks an unknown rating).
rated_tornadoes = df[df['mag'] != -9]
ef_ratings = rated_tornadoes.groupby('mag')['fatalities'].mean().reset_index()

# One bar per rating; bar height is the mean of 'fatalities' within that rating
ax = ef_ratings.plot(x='mag', y='fatalities', kind='bar')
ax.set_xlabel("F or EF Rating")
ax.set_ylabel("Mean fatalities")
ax.set_title('Mean fatalities for each EF rating', fontweight='bold')
Text(0.5, 1.0, 'Mean fatalities for each EF rating')
What is the EF scale? The Enhanced Fujita scale (abbreviated as EF-Scale) rates a tornado's intensity based on the severity of the damage it causes.
# Total fatalities for every (state, year) pair present in the data
fatalities_grouped = df.groupby(['state', 'yr'])['fatalities'].sum()
fats_per_yr = fatalities_grouped.reset_index()

# US map shaded by fatalities, with one animation frame per value of 'yr'
fig = px.choropleth(
    data_frame=fats_per_yr,
    locations="state",
    locationmode="USA-states",
    scope="usa",
    color='fatalities',
    color_continuous_scale='YlOrBr',
    animation_frame="yr",
    labels={'yr': 'Year', 'state': 'State', 'fatalities': 'Deaths'},
)
fig.update_layout(title="Tornado Fatalities in States Per Year")
fig.show()
# Total fatalities per year.
# sum() adds up the deaths recorded in each year; count() would only return
# the number of tornado records per year, which is not what the chart is labelled as.
df_fat = df.groupby('yr')['fatalities'].sum().reset_index()

# Line chart of the yearly fatality totals
df_fat.plot(x='yr', y='fatalities')
plt.xlabel('Year')
plt.ylabel('No of fatalities')
plt.title('Fatalities over the years', fontweight='bold')
Text(0.5, 1.0, 'Fatalities over the years')
# Total fatalities per state
state_fatalities = df.groupby('state')['fatalities'].sum().reset_index()

# US map shaded by each state's fatality total
fig = px.choropleth(
    state_fatalities,
    locations='state',
    locationmode='USA-states',
    color='fatalities',
    color_continuous_scale='ORRD',
    scope='usa',
)
# The title describes fatalities, since that is the quantity mapped to colour
fig.update_layout(
    title_text='Tornado fatalities across the states in the US.<br>(Hover for statewise totals)'
)
fig.show()